# Name:     Denis Stukal
# Date:     November 3, 2021
# Summary:  Replication code for "Why Botter: How Pro-Government Bots Fight Opposition in Russia"

Sys.setlocale("LC_ALL", "ru_RU.UTF-8")

rm(list = ls())
gc()

# The commented out code below installs all required packages if necessary
# installed_packages <- installed.packages()[,1]
# required_packages <- c('bit64', 'data.table', 'lubridate', 'openxlsx', 'ggplot2', 'MASS', 'Matrix', 'lme4', 'optimx', 'dfoptim', 'plyr', 'heavy', 'coda', 'stm', 'tm')
# abscent_packages <- setdiff(x = required_packages, y = installed_packages)
# for (pack in abscent_packages) {
#   install.packages(pack)
# }

library(bit64)
library(data.table)
library(lubridate)
library(openxlsx)
library(ggplot2)

library(MASS)
library(Matrix)
library(lme4)
library(optimx)
library(dfoptim)

library(plyr)
library(heavy) # rmnorm
library(coda) # HPD

library(stm) # Fig 17-18
library(tm)  # Fig 17-18

source('helper_functions_apsr.R')

TOTAL_NUM_MODELS = 191 # total number of estimated regression models 

#------------------ SECTION 1: LOAD DATA
# Load bot tweet data
tw <- get( load('tw.RData') )

# Load protest data
protest <- list()
protest$raw_data <- read.table(file = 'protest_data_replication.txt', header = T, sep = '\t')

# Load data on opposition activists
optw <- fread(file = 'opposition.txt', sep = '\t', colClasses = rep('character', 3))


#------------------ SECTION 2: PRE-PROCESS DATA
##### Get total number of tweets per bot
num_tw <- tw[, .('num_tw' = .N), by = 'userid']
tw <- merge(x = tw, y = num_tw, by = 'userid')

##### Add month sequence numbers (month_num)
tw <- add_month_sequence_number(tw)

##### Add protest data
data_merged <- list()

# Validated ICEWS dataset
protest$dates_main <- add_lags_leads_to_protest(protest_dates = subset(protest$raw_data, dataset == 'icews')$date, shift_range = c(-7, 7))
# Extended protest dates from ICEWS, MMAD, Lankina's datasets
protest$dates_extended <- add_lags_leads_to_protest(protest_dates = protest$raw_data$date, shift_range = c(-7, 7))

data_merged$main <- add_protest_data(tw_data = tw, protest_data = protest$dates_main)
data_merged$extended <- add_protest_data(tw_data = tw, protest_data = protest$dates_extended)

data_merged$main <- add_cumulative_lags_leads(tw_data = data_merged$main, shift_range = c(-7, 7))
data_merged$extended <- add_cumulative_lags_leads(tw_data = data_merged$extended, shift_range = c(-7, 7))


##### Identify peaks in opposition data
agg_op <- optw[, .('opsum' = .N), by = 'date']
agg_op <- get_moving_median(data = agg_op, variable = 'opsum', window_size = 14)
agg_op$date <- as.Date(agg_op$date)
agg_op$peak5 <- ifelse(agg_op$opsum/agg_op$opsum_movmed >= 5, 1, 0)

peak_data <- subset(agg_op, select = c(date, opsum, peak5))
peak_data <- peak_data[order(peak_data$date, decreasing = F),]
rownames(peak_data) <- NULL

### Prepare data for Fig.16
# Merge dates, protest dates, dates of peaks, and totals of opposition activists' tweets per day
fig16_data <- data.frame('date' = unique(data_merged$main$date), stringsAsFactors = F)
fig16_data <- merge(x = fig16_data, y = peak_data, by = 'date', all = T)
fig16_data$protest <- ifelse(fig16_data$date %in% as.Date(subset(protest$raw_data, dataset == 'icews')$date), 1, 0)
fig16_data[is.na(fig16_data)] <- 0
# Make a list: all dates with opposition activists' totals, and only days of protest or peaks (for dots in Fig.16)
fig16_data <- list('oppos_num_tw' = fig16_data)
fig16_data$daily <- subset(fig16_data$oppos_num_tw, peak5 == 1 | protest == 1)
fig16_data$daily$event <- ifelse(fig16_data$daily$peak5 == 1, 'spike', ifelse(fig16_data$daily$protest == 1, 'protest', NA))
fig16_data$daily$event[fig16_data$daily$protest == 1 & fig16_data$daily$peak5 == 1] <- 'both'
fig16_data$daily$event <- factor(fig16_data$daily$event, levels = c('protest', 'spike', 'both'))
rownames(fig16_data$daily) <- NULL

##### Get placebo days
# Concatenate protest and peak data (main analysis)
protest_or_peak <- unique( c(get_protest_dates(main = T), get_peak_dates()) )
protest_or_peak <- as.Date(protest_or_peak[order(protest_or_peak, decreasing = F)])

# Concatenate protest and peak data (Appendix J analysis)
protest_or_peak_extended <- unique( c(get_protest_dates(main = F), get_peak_dates()) )
protest_or_peak_extended <- as.Date(protest_or_peak_extended[order(protest_or_peak_extended, decreasing = F)])

# Get protest/peak periods (protest/peak +- 3 days ) and eligible days for placebos (main analysis)
all_dates <- unique(data_merged$main$date)
protest_or_peak_periods <- as.Date( unique(as.vector( sapply(protest_or_peak, function(k) seq(from = k - 3, to = k + 3, by = 1)) )) , origin = '1970-01-01')  
eligible_dates <- all_dates[!all_dates %in% protest_or_peak_periods] 

# Get protest/peak periods (protest/peak +- 3 days ) and eligible days for placebos (Appendix J)
protest_or_peak_periods_extended <- as.Date( unique(as.vector( sapply(protest_or_peak_extended, function(k) seq(from = k - 3, to = k + 3, by = 1)) )) , origin = '1970-01-01')  
eligible_dates_extended <- all_dates[!all_dates %in% protest_or_peak_periods_extended] 

# Sample placebo days
set.seed(66666)
placebo_dates <- sample(x = eligible_dates, size = 10, replace = F)
placebo_dates_extended <- sample(x = eligible_dates_extended, size = 10, replace = F)

# Fix peak_data
peak_data$date <- as.Date(peak_data$date)

##### Make data for regression analysis
reg_data   <- make_data_reg(data = data_merged$main, placebo_dates = placebo_dates, opposition_data = peak_data)
reg_data_extended   <- make_data_reg(data = data_merged$extended, placebo_dates = placebo_dates_extended, opposition_data = peak_data)
reg_data_sent <- make_data_reg(data = data_merged$main, placebo_dates = placebo_dates, opposition_data = peak_data, sentiment = T) # for Appendix L

#------------------ SECTION 3: MAIN ANALYSIS
##### Main part: No lags/leads
# Set up data (rename explanatory variables)
reg_data_main <- setup_data_analysis(data = reg_data, curprotest = 'protest', curpeak = 'peak5', curplacebo = 'placebo')

# Run models
models_main <- list()
models_main$ntw <- glmer(ntw ~ protest_var + peak_var + placebo_var + (1 + protest_var + peak_var + placebo_var|id_num) + (1|month), 
                         data = reg_data_main, family = 'poisson',
                         control = glmerControl(optimizer = 'optimx', optCtrl  = list(method="nlminb"), 
                                                check.conv.grad = .makeCC("warning", tol = 7e-3)))

models_main$rtd <- glmer(rtd ~ protest_var + peak_var + placebo_var + (1 + protest_var + peak_var + placebo_var|id_num) + (1|month), 
                         data = reg_data_main, family = 'poisson',
                         control = glmerControl(optimizer = 'bobyqa', check.conv.grad = .makeCC("warning", tol = 7e-3)))


models_main$put <- glmer(put_sum ~ protest_var + peak_var + placebo_var + (1 + protest_var + peak_var + placebo_var|id_num) + (1|month), 
                         data = reg_data_main, family = 'poisson',
                         control = glmerControl(optimizer = 'bobyqa', check.conv.grad = .makeCC("warning", tol = 7e-3)))


models_main$nav <- glmer(nav_sum ~ protest_var + peak_var + placebo_var + (1 + protest_var + peak_var + placebo_var|id_num) + (1|month), 
                         data = reg_data_main, family = 'poisson',
                         control = glmerControl(optimizer = 'bobyqa', check.conv.grad = .makeCC("warning", tol = 7e-3)))


##### Lags 1-3 in protest or peaks (time: 16.5 h)
# Set up data (rename explanatory variables)
LAG_MIN = 1
LAG_MAX = 7
reg_data_lags <- list()
for (i in LAG_MIN:LAG_MAX) {
  reg_data_lags[[paste0('protest_lag', i)]] <- setup_data_analysis(data = reg_data, curprotest = paste0('protest_lag', i, '_cumul'), curpeak = 'peak5', curplacebo = 'placebo')
  reg_data_lags[[paste0('peak_lag', i)]] <- setup_data_analysis(data = reg_data, curprotest = 'protest', curpeak = paste0('peak_lag', i, '_cumul'), curplacebo = 'placebo')
}

# Run models
models_main_lags <- list()
for (i in LAG_MIN:LAG_MAX) {
  # Models for four dependent variables with protest lags
  models_main_lags[[paste0('protest_lag', i, '_ntw')]] <- update(models_main$ntw, ntw ~ .,     data = reg_data_lags[[paste0('protest_lag', i)]] )
  models_main_lags[[paste0('protest_lag', i, '_rtd')]] <- update(models_main$ntw, rtd ~ .,     data = reg_data_lags[[paste0('protest_lag', i)]] )
  models_main_lags[[paste0('protest_lag', i, '_put')]] <- update(models_main$ntw, put_sum ~ ., data = reg_data_lags[[paste0('protest_lag', i)]] )
  models_main_lags[[paste0('protest_lag', i, '_nav')]] <- update(models_main$ntw, nav_sum ~ ., data = reg_data_lags[[paste0('protest_lag', i)]] )
  
  # Models for four dependent variables with peak lags
  models_main_lags[[paste0('peak_lag', i, '_ntw')]] <- update(models_main$ntw, ntw ~ .,     data = reg_data_lags[[paste0('peak_lag', i)]] )
  models_main_lags[[paste0('peak_lag', i, '_rtd')]] <- update(models_main$ntw, rtd ~ .,     data = reg_data_lags[[paste0('peak_lag', i)]] )
  models_main_lags[[paste0('peak_lag', i, '_put')]] <- update(models_main$ntw, put_sum ~ ., data = reg_data_lags[[paste0('peak_lag', i)]] )
  models_main_lags[[paste0('peak_lag', i, '_nav')]] <- update(models_main$ntw, nav_sum ~ ., data = reg_data_lags[[paste0('peak_lag', i)]] )
}

##### Leads 1-3 in protest or peaks (time: 16 h)
# Set up data (rename explanatory variables)
LEAD_MIN = 1
LEAD_MAX = 7
reg_data_leads <- list()
for (i in LEAD_MIN:LEAD_MAX) {
  reg_data_leads[[paste0('protest_lead', i)]] <- setup_data_analysis(data = reg_data, curprotest = paste0('protest_lead', i, '_cumul'), curpeak = 'peak5', curplacebo = 'placebo')
  reg_data_leads[[paste0('peak_lead', i)]] <- setup_data_analysis(data = reg_data, curprotest = 'protest', curpeak = paste0('peak_lead', i, '_cumul'), curplacebo = 'placebo')
}

# Run models
models_main_leads <- list()
for (i in LEAD_MIN:LEAD_MAX) {
  # Models for four dependent variables with protest leads
  models_main_leads[[paste0('protest_lead', i, '_ntw')]] <- update(models_main$ntw, ntw ~ .,     data = reg_data_lags[[paste0('protest_lead', i)]] )
  models_main_leads[[paste0('protest_lead', i, '_rtd')]] <- update(models_main$ntw, rtd ~ .,     data = reg_data_lags[[paste0('protest_lead', i)]] )
  models_main_leads[[paste0('protest_lead', i, '_put')]] <- update(models_main$ntw, put_sum ~ ., data = reg_data_lags[[paste0('protest_lead', i)]] )
  models_main_leads[[paste0('protest_lead', i, '_nav')]] <- update(models_main$ntw, nav_sum ~ ., data = reg_data_lags[[paste0('protest_lead', i)]] )
  
  # Models for four dependent variables with peak leads
  models_main_leads[[paste0('peak_lead', i, '_ntw')]] <- update(models_main$ntw, ntw ~ .,     data = reg_data_lags[[paste0('peak_lead', i)]] )
  models_main_leads[[paste0('peak_lead', i, '_rtd')]] <- update(models_main$ntw, rtd ~ .,     data = reg_data_lags[[paste0('peak_lead', i)]] )
  models_main_leads[[paste0('peak_lead', i, '_put')]] <- update(models_main$ntw, put_sum ~ ., data = reg_data_lags[[paste0('peak_lead', i)]] )
  models_main_leads[[paste0('peak_lead', i, '_nav')]] <- update(models_main$ntw, nav_sum ~ ., data = reg_data_lags[[paste0('peak_lead', i)]] )
  
}

##### Process model results for plotting
# The input datasets were produced above. 
# For convenience, they are also provided and can be loaded with the following commands:
# reg_data_main <- get( load('reg_data_main.RData') )
# reg_data_leads <- get( load('reg_data_leads.RData') )
# reg_data_lags <- get( load('reg_data_lags.RData') )
# models_main <- get( load('models_main.RData') )
# models_main_leads <- get( load('models_main_leads.RData') )
# models_main_lags<- get( load('models_main_lags.RData') )

# Restructure all results and data into 2 lists
models <- do.call(c, list(models_main, models_main_lags, models_main_leads))
reg_data_main <- list('main' = reg_data_main)
reg_datasets <- do.call(c, list(reg_data_main, reg_data_leads, reg_data_lags))

# Compute SAPD and store the results for ggplotting
sapd_data <- compute_sapd(models = models, reg_datasets = reg_datasets, BONFE_NUM = TOTAL_NUM_MODELS) # 30 min
datplot <- lapply(names(sapd_data), function(k) sapd_data[[k]]$datplot)
datplot <- do.call('rbind', datplot)

##### Figure 2 (Volume of tweets: Effect size)
ggplot_data <- make_data_for_plot(dat = datplot, block = 'ntw', shift = 3, lags = T, leads = T)
fig2 <- make_quick_plot(dat = ggplot_data, title = '', xaxis_label = NULL, xmax = 2) 
rm(ggplot_data)


##### Figure 3 (Retweet diversity: Effect size)
ggplot_data <- make_data_for_plot(dat = datplot, block = 'rtd', shift = 3, lags = T, leads = T)
fig3 <- make_quick_plot(dat = ggplot_data, title = '', xaxis_label = NULL, xmax = 2) 
rm(ggplot_data)


##### Figure 4 (Cheerleading: Effect size)
ggplot_data <- make_data_for_plot(dat = datplot, block = 'put', shift = 3, lags = T, leads = T)
fig4 <- make_quick_plot(dat = ggplot_data, title = '', xaxis_label = NULL, xmax = 1) 
rm(ggplot_data)


##### Figure 5 (Negative campaigning: Effect size)
ggplot_data <- make_data_for_plot(dat = datplot, block = 'nav', shift = 3, lags = T, leads = T)
fig5 <- make_quick_plot(dat = ggplot_data,  title = '', xaxis_label = NULL, xmax = 0.5) 
rm(ggplot_data)


#--------------- SECTION 4: Appendix F "Days before and after" (Fig. 3ap - 10ap)
# NB! Fig 1ap and 2ap (App. D and E) are flowcharts created w/o any data
# This section provides the code for making Fig 3ap - 15ap
# The input data is datplot created in the previous section and used for making Fig 2-5. 
# For time considerations, we provide the datplot_main_lags7_leads7.txt file that can be loaded as follows:
# datplot <- read.table(file = 'datplot_main_lags7_leads7.txt', header = T, sep = '\t')

# Fig.1ap (volume: 7 days before events)
ggplot_data <- make_data_for_plot(dat = datplot, block = 'ntw', shift = 7, lags = T, leads = F)
fig1ap <- make_quick_plot(dat = ggplot_data, title = '', xaxis_label = NULL, xmax = 2) 
rm(ggplot_data)

# Fig.2ap (retweet diversity: 7 days before events)
ggplot_data <- make_data_for_plot(dat = datplot, block = 'rtd', shift = 7, lags = T, leads = F)
fig2ap <- make_quick_plot(dat = ggplot_data, title = '', xaxis_label = NULL, xmax = 2) 
rm(ggplot_data)

# Fig.3ap (cheerleading: 7 days before events)
ggplot_data <- make_data_for_plot(dat = datplot, block = 'put', shift = 7, lags = T, leads = F)
fig3ap <- make_quick_plot(dat = ggplot_data, title = '', xaxis_label = NULL, xmax = 2) 
rm(ggplot_data)

# Fig.4ap (negative campaigning: 7 days before events)
ggplot_data <- make_data_for_plot(dat = datplot, block = 'nav', shift = 7, lags = T, leads = F)
fig4ap <- make_quick_plot(dat = ggplot_data, title = '', xaxis_label = NULL, xmax = 2) 
rm(ggplot_data)

# Fig.5ap (volume: 7 days before events)
ggplot_data <- make_data_for_plot(dat = datplot, block = 'ntw', shift = 7, lags = F, leads = T)
fig5ap <- make_quick_plot(dat = ggplot_data, title = '', xaxis_label = NULL, xmax = 2) 
rm(ggplot_data)

# Fig.6ap (retweet diversity: 7 days before events)
ggplot_data <- make_data_for_plot(dat = datplot, block = 'rtd', shift = 7, lags = F, leads = T)
fig6ap <- make_quick_plot(dat = ggplot_data, title = '', xaxis_label = NULL, xmax = 2) 
rm(ggplot_data)

# Fig.7ap (cheerleading: 7 days before events)
ggplot_data <- make_data_for_plot(dat = datplot, block = 'put', shift = 7, lags = F, leads = T)
fig7ap <- make_quick_plot(dat = ggplot_data, title = '', xaxis_label = NULL, xmax = 2) 
rm(ggplot_data)

# Fig.8ap (negative campaigning: 7 days before events)
ggplot_data <- make_data_for_plot(dat = datplot, block = 'nav', shift = 7, lags = F, leads = T)
fig8ap <- make_quick_plot(dat = ggplot_data, title = '', xaxis_label = NULL, xmax = 2) 
rm(ggplot_data)


#--------------- SECTION 5: Appendix G "Identifiability assumption" (Fig. 11ap - 13ap)
##### Fig 11ap (separability of protests and spikes)
# The input data for Fig 16 was generated in Section 2. For convenience, we provide files for loading this data as follows:
# fig16_data <- lapply(c('oppos_num_tw', 'daily'), function(k) read.table(file = paste0('fig16_data_', k, '.txt'), header = T, sep = '\t') )
# names(fig16_data) <- c('oppos_num_tw', 'daily')

fig11ap <- ggplot(data = fig16_data$oppos_num_tw, aes(x = date, y = opsum)) +
  geom_line(color = 'grey30') +   # 'dodgerblue4'
  geom_point(data = fig16_data$daily, aes(x = date, y = opsum,
                            color = event,
                            shape = event,
                            size = event  )) +
  ylab('Number of tweets\nby opposition activists') + # 'Number of tweets\nby pro-opposition bots
  xlab('') +
  theme_bw() +
  theme(axis.text = element_text(color = 'black', size = 12),
        axis.title = element_text(size = 12), 
        legend.text=element_text(size=12),
        legend.title=element_text(size=12) ) + 
  scale_color_manual(name = "event", values = c('red', 'green2', 'royalblue2')) +
  scale_shape_manual(name = "event", values = c(16, 15, 17)) + 
  scale_size_manual(name = "event", values = c(4, 3, 3)) +
  guides(color=guide_legend(title="Event types"), shape=guide_legend(title="Event types"), size=guide_legend(title="Event types"))

###### Fig 12ap and 13ap
# Prepare the dataset for creating the document-text matrix 
data_merged$main$lemma <- gsub(pattern = '^rt', '', data_merged$main$lemma) # remove RT from lemmas
data_merged$main$peak5 <- ifelse(data_merged$main$date %in% peak_data$date[peak_data$peak5 == 1], 1, 0)

data_merged$main <- as.data.frame(data_merged$main)
data_for_stm <- data_merged$main

# Pre-process text (time: 12 mins)
temp <- textProcessor(documents = data_for_stm$lemma, 
                      metadata = data_for_stm, 
                      sparselevel = 0.999, 
                      removestopwords = T, removenumbers = F, removepunctuation = T, 
                      stem = F, language = 'ru')

# Prepare a dataset for LDA (0.5 min)
out999 <- prepDocuments(temp$documents, temp$vocab, temp$meta)


# Select LDA model for days with protest (time: 16 h)
NUMTOPICS = 35
stm_models_protest <- selectModel(documents = out999$documents, vocab = out999$vocab,
                         prevalence = ~ protest, data = out999$meta,
                         max.em.its = 500, net.max.em.its = 5, seed = 80720192,
                         K = NUMTOPICS, init.type = 'LDA', runs = 15)

stm_models_peaks <- selectModel(documents = out999$documents, vocab = out999$vocab,
                         prevalence = ~ peak5, data = out999$meta,
                         max.em.its = 500, net.max.em.its = 5, seed = 80720192,
                         K = numtopics, init.type = 'LDA', runs = 15)

# Select models
# The input object were produced above; they are also provided and can be loaded with the following commands:
# stm_models_protest <- get( load( 'stm35_sparse999_protest.RData' ) ) 
# stm_models_peaks <- get( load( 'stm35_sparse999_peak.RData' ) )  
# out999 <- get( load( 'out999.RData' ) ) 
best_models <- list()
best_models$protest <- stm_models_protest$runout[[3]]
best_models$peaks <- stm_models_peaks$runout[[3]]

# Set up data for Fig 17 and 18
topic_distrib <- list()
topic_medians <- list()

protest_dates <- get_protest_dates()
peak_dates <- get_peak_dates()

# Summarize topical distribution per protest/peak
# Put together meta-data and topical distribution from the estimated stm object
# Rows are tweets, Cols are topic nums, entries are probs
topic_distrib$protest <- cbind(subset(out999$meta, select = c(date, lemma, protest)), best_models$protest$theta)
topic_distrib$peaks <- cbind(subset(out999$meta, select = c(date, lemma, peak5)), best_models$peaks$theta)

# Add protest and peak numbers to the dataset
topic_distrib$protest <- make_protest_numbers(topic_distrib$protest, protest_dates = protest_dates)
topic_distrib$peaks <- make_peak_numbers(topic_distrib$peaks, peak_dates = peak_dates)

# Get topic probability median for each protest day
topic_medians$protest <- get_median_topic_prob_by_event(topic_data = topic_distrib$protest, protest = T)
topic_medians$peaks <- get_median_topic_prob_by_event(topic_data = topic_distrib$peaks, protest = F)

# Make Fig 12ap
# The input data (topic_medians) was generated above in this section. For convenience, we provide
# the stm_topic_medians.RData file that can be loaded with load('stm_topic_medians.RData')

fig12ap <- ggplot(data = topic_medians$protest, aes(x=event, y=topic, fill=value)) + 
  geom_tile() + 
  xlab('Protest number') + 
  ylab('Topic number') + 
  guides(fill=guide_legend(title='Median Pr(Topic | Document)', direction = "horizontal", title.position = 'top'))  +
  theme(legend.position="bottom")

# Make Fig 13ap
# The input data (topic_medians) was generated above in this section. For convenience, we provide
# the stm_topic_medians.RData file that can be loaded with load('stm_topic_medians.RData')

fig13ap <- ggplot(data = topic_medians$peaks, aes(x=event, y=topic, fill=value)) + 
  geom_tile() + 
  xlab('Peak number') + 
  ylab('Topic number') + 
  guides(fill=guide_legend(title='Median Pr(Topic | Document)', direction = "horizontal", title.position = 'top'))  +
  theme(legend.position="bottom")

# Label topics prominent during protests
topic_frex_protest <- get_frex_topics(model = best_models$protest, nwords = 7)
topic_frex_protest[[paste0('topic_', 12)]]
findThoughts(best_models$protest, texts = out999$meta$lemma, n = 10, topics = 12)$docs[[1]]

# Label topics prominent during online spikes
topic_frex_spikes <- get_frex_topics(model = best_models$peaks, nwords = 7)
topic_frex_spikes[[paste0('topic_', 23)]]
topic_frex_spikes[[paste0('topic_', 15)]]
findThoughts(best_models$peaks, texts = out999$meta$lemma, n = 10, topics = 23)$docs[[1]]
findThoughts(best_models$peaks, texts = out999$meta$lemma, n = 10, topics = 15)$docs[[1]]


#--------------- SECTION 6: Appendix J 'Augmented Protest Dataset' (Fig. 15ap - 18ap)
# NB! Fig 14ap is a causal graph that does not require any data and can be drawn with any appropriate software
# The input dataset (reg_data_extended) was produced above in Section 2.
# For convenience, this dataset is provided as the reg_data_extended.RData file
# that can be load with reg_data_extended <- get( load('reg_data_extended.RData') )

##### Set up data (rename explanatory variables)
reg_data_extended <- setup_data_analysis(data = reg_data_extended, curprotest = 'protest', curpeak = 'peak5', curplacebo = 'placebo')

##### Run models (time: 1.5 h)
models_extended <- list()
models_extended$ntw_extended <- glmer(ntw ~ protest_var + peak_var + placebo_var + (1 + protest_var + peak_var + placebo_var|id_num) + (1|month), 
                             data = reg_data_extended, family = 'poisson',
                             control = glmerControl(optimizer = 'optimx', optCtrl  = list(method="nlminb"), 
                                                    check.conv.grad = .makeCC("warning", tol = 7e-3)))

models_extended$rtd_extended <- glmer(rtd ~ protest_var + peak_var + placebo_var + (1 + protest_var + peak_var + placebo_var|id_num) + (1|month), 
                             data = reg_data_extended, family = 'poisson',
                             control = glmerControl(optimizer = 'bobyqa', check.conv.grad = .makeCC("warning", tol = 7e-3)))


models_extended$put_extended <- glmer(put_sum ~ protest_var + peak_var + placebo_var + (1 + protest_var + peak_var + placebo_var|id_num) + (1|month), 
                             data = reg_data_extended, family = 'poisson',
                             control = glmerControl(optimizer = 'bobyqa', check.conv.grad = .makeCC("warning", tol = 7e-3)))


models_extended$nav_extended <- glmer(nav_sum ~ protest_var + peak_var + placebo_var + (1 + protest_var + peak_var + placebo_var|id_num) + (1|month), 
                             data = reg_data_extended, family = 'poisson',
                             control = glmerControl(optimizer = 'bobyqa', check.conv.grad = .makeCC("warning", tol = 7e-3)))


##### Compute SAPD
# The input datasets were produced above in this Section and Section 3.
# For convenience, they are also provided and can be loaded with the following commands:
# models_extended <- get( load('models_extended.RData') )
# models_main <- get( load('models_main.RData') )
# reg_data_main <- get( load('reg_data_main.RData') )
# reg_data_extended <- get( load('reg_data_extended.RData') )

# Restructure all results and data into 2 lists
models <- do.call(c, list(models_main, models_extended))

reg_data_main <- list('main' = reg_data_main)
reg_data_extended <- setup_data_analysis(data = reg_data_extended)
reg_data_extended <- list('extended' = reg_data_extended)
reg_datasets <- do.call(c, list(reg_data_main, reg_data_extended))

# Compute SAPD and store the results for ggplotting
sapd_data_extended <- compute_sapd(models = models, reg_datasets = reg_datasets, BONFE_NUM = TOTAL_NUM_MODELS)
datplot_extended <- lapply(names(sapd_data_extended), function(k) sapd_data_extended[[k]]$datplot)
datplot_extended <- do.call('rbind', datplot_extended)

##### Make Fig 15ap - 18ap
# The input data (datplot_extended) was produced above in this section.
# For convenience, it is also provided as the datplot_extended.txt file 
# that can be loaded with datplot_extended <- read.table(file = 'datplot_extended.txt', header = T, sep = '\t')

fig15ap <- make_quick_plot_extended(data = make_data_for_plot_extended(data = datplot_extended, block = 'ntw'), xmax = 1.5)
fig16ap <- make_quick_plot_extended(data = make_data_for_plot_extended(data = datplot_extended, block = 'rtd'))
fig17ap <- make_quick_plot_extended(data = make_data_for_plot_extended(data = datplot_extended, block = 'put'))
fig18ap <- make_quick_plot_extended(data = make_data_for_plot_extended(data = datplot_extended, block = 'nav'), xmax = 0.5)

#--------------- SECTION 7: Appendix K 'Subsets of activists' (Fig. 19ap - 22ap)

##### Make random subsets of opposition activists
set.seed(223344)
opsets <- lapply(1:10, function(k) sample(x = optw$userid, size = 10))

optw_sets   <- lapply(1:length(opsets), function(k) subset(optw, userid %in% opsets[[k]])  )
agg_op_sets <- lapply(1:length(opsets), function(k) optw_sets[[k]][, .('opsum' = .N), by = 'date'] )
agg_op_sets <- lapply(1:length(opsets), function(k) get_moving_median(data = agg_op_sets[[k]], variable = 'opsum', window_size = 14) )

##### Prepare data using subsets of opposition activists (identify peaks and put data together)
peak_data_sets <- list()
reg_data_opp_sets <- list()
for (i in 1:length(opsets)) {
  cat(i, '\n')
  agg_op_sets[[i]]$date <- as.Date(agg_op_sets[[i]]$date)
  agg_op_sets[[i]]$peak5 <- ifelse(agg_op_sets[[i]]$opsum/agg_op_sets[[i]]$opsum_movmed >= 5, 1, 0)
  cat(table(agg_op_sets[[i]]$peak5), '\n')
  peak_data_sets[[i]] <- subset(agg_op_sets[[i]], select = c(date, opsum, peak5))
  peak_data_sets[[i]] <- peak_data_sets[[i]][order(peak_data_sets[[i]]$date, decreasing = F),]
  peak_data_sets[[i]]$date <- as.Date(peak_data_sets[[i]]$date)
  rownames(peak_data_sets[[i]]) <- NULL
  reg_data_opp_sets[[i]] <- make_data_reg(data = data_merged$main, placebo_dates = placebo_dates, opposition_data = peak_data_sets[[i]])
}

##### Estimate models
# Volume (Fig.19ap) time: 6h 
models_opp_subsets_ntw <- list()

for (i in 1:length(reg_data_opp_sets)) {
  new_data <- setup_data_analysis(data = reg_data_opp_sets[[i]])
  models_opp_subsets_ntw[[i]] <- glmer(ntw ~ protest_var + peak_var + placebo_var + (1 + protest_var + peak_var + placebo_var|id_num) + (1|month), 
                                      data = new_data, family = 'poisson',
                                      control = glmerControl(optimizer = 'optimx', optCtrl  = list(method="nlminb"), 
                                                             check.conv.grad = .makeCC("warning", tol = 7e-3)))
  
  rm(new_data)
  gc()
}

# Retweet diversity (Fig.20ap) time: 3h
models_opp_subsets_rtd <- list()

for (i in 1:length(reg_data_opp_sets)) {
  new_data <- setup_data_analysis(data = reg_data_opp_sets[[i]])
  models_opp_subsets_rtd[[i]] <- glmer(rtd ~ protest_var + peak_var + placebo_var + (1 + protest_var + peak_var + placebo_var|id_num) + (1|month), 
                                       data = new_data, family = 'poisson',
                                       control = glmerControl(optimizer = 'bobyqa', check.conv.grad = .makeCC("warning", tol = 7e-3)))
  
  rm(new_data)
  gc()
}

# Cheerleading (Fig.21ap) time: 2.5h
models_opp_subsets_put <- list()

for (i in 1:length(reg_data_opp_sets)) {
  new_data <- setup_data_analysis(data = reg_data_opp_sets[[i]])
  models_opp_subsets_put[[i]] <- glmer(put_sum ~ protest_var + peak_var + placebo_var + (1 + protest_var + peak_var + placebo_var|id_num) + (1|month), 
                                       data = new_data, family = 'poisson',
                                       control = glmerControl(optimizer = 'bobyqa', check.conv.grad = .makeCC("warning", tol = 7e-3)))
  
  rm(new_data)
  gc()
}

# Negative campaigning (Fig.22ap) time: 2.5h
models_opp_subsets_nav <- list()

for (i in 1:length(reg_data_opp_sets)) {
  new_data <- setup_data_analysis(data = reg_data_opp_sets[[i]])
  models_opp_subsets_nav[[i]] <- glmer(nav_sum ~ protest_var + peak_var + placebo_var + (1 + protest_var + peak_var + placebo_var|id_num) + (1|month), 
                                       data = new_data, family = 'poisson',
                                       control = glmerControl(optimizer = 'bobyqa', check.conv.grad = .makeCC("warning", tol = 7e-3)))
  
  rm(new_data)
  gc()
}

##### Compute SAPD
# The input datasets were produced above in this section.
# For convenience, they are also provided and can be loaded with the following commands:
# models_opp_subsets_ntw <- get( load('models_opp_subsets_ntw.RData') )
# models_opp_subsets_rtd <- get( load('models_opp_subsets_rtd.RData') )
# models_opp_subsets_put <- get( load('models_opp_subsets_put.RData') )
# models_opp_subsets_nav <- get( load('models_opp_subsets_nav.RData') )
# models_main <- get( load('models_main.RData') )
# reg_data_main <- get( load('reg_data_main.RData') )
# reg_data_opp_sets <- get( load('reg_data_opp_sets.RData') )

# Restructure all results and data into 2 lists
names(models_opp_subsets_ntw) <- paste0('ntw_subset', 1:10)
names(models_opp_subsets_rtd) <- paste0('rtd_subset', 1:10)
names(models_opp_subsets_put) <- paste0('put_subset', 1:10)
names(models_opp_subsets_nav) <- paste0('nav_subset', 1:10)

models <- do.call(c, list(models_main, models_opp_subsets_ntw, models_opp_subsets_rtd, models_opp_subsets_put, models_opp_subsets_nav))

reg_data_main <- list('main' = reg_data_main)
reg_data_opp_sets <- get( load('reg_data_opp_sets.RData') )
reg_data_opp_sets <- lapply(reg_data_opp_sets, function(k) setup_data_analysis(data = k) )
names(reg_data_opp_sets) <- paste0('subset', 1:10)
reg_datasets <- do.call(c, list(reg_data_main, reg_data_opp_sets))

# Compute SAPD and store the results for ggplotting
sapd_data_opp_sets <- compute_sapd(models = models, reg_datasets = reg_datasets, BONFE_NUM = TOTAL_NUM_MODELS)
datplot_opp_sets <- lapply(names(sapd_data_opp_sets), function(k) sapd_data_opp_sets[[k]]$datplot)
datplot_opp_sets <- do.call('rbind', datplot_opp_sets)

##### Make Fig 19ap - 22ap
# The input datasets were produced above in this section.
# For convenience, they are also provided and can be loaded with the following commands:
# datplot_opp_sets <- read.table(file = 'datplot_opp_sets.txt', header = T, sep = '\t')

datplot_opp_sets <- prepare_data_opp_sets(data = datplot_opp_sets)

fig19ap <- make_quick_plot_opp(data = datplot_opp_sets$ntw, xmax = 1.5)
fig20ap <- make_quick_plot_opp(data = datplot_opp_sets$rtd, xmax = 1.5)
fig21ap <- make_quick_plot_opp(data = datplot_opp_sets$put, xmax = 1)
fig22ap <- make_quick_plot_opp(data = datplot_opp_sets$nav, xmax = 0.5)

#--------------- SECTION 8: Appendix L 'Sentiment analysis' (Fig. 23ap - 24ap)
# The input dataset was produced above in Section 2. 
# For convenience, it is also provided and can be loaded with the following command: reg_data_sent <- get( load('reg_data_sent.RData') )

##### Estimate models
reg_data_sent <- setup_data_analysis(data = reg_data_sent)

models_sent <- list()

models_sent$put_sent <- glmer(put_sum_posrusentilex ~ protest_var + peak_var + placebo_var + (1 + protest_var + peak_var + placebo_var|id_num) + (1|month), 
                         data = reg_data_sent, family = 'poisson',
                         control = glmerControl(optimizer = 'bobyqa', check.conv.grad = .makeCC("warning", tol = 7e-3)))

models_sent$put_sent_dif <- glmer(put_sum_negrusentilex ~ protest_var + peak_var + placebo_var + (1 + protest_var + peak_var + placebo_var|id_num) + (1|month), 
                             data = reg_data_sent, family = 'poisson',
                             control = glmerControl(optimizer = 'bobyqa', check.conv.grad = .makeCC("warning", tol = 7e-3)))

models_sent$nav_sent <- glmer(nav_sum_negrusentilex ~ protest_var + peak_var + placebo_var + (1 + protest_var + peak_var + placebo_var|id_num) + (1|month), 
                         data = reg_data_sent, family = 'poisson',
                         control = glmerControl(optimizer = 'bobyqa', check.conv.grad = .makeCC("warning", tol = 7e-3)))

models_sent$nav_sent_dif <- glmer(nav_sum_posrusentilex ~ protest_var + peak_var + placebo_var + (1 + protest_var + peak_var + placebo_var|id_num) + (1|month), 
                             data = reg_data_sent, family = 'poisson',
                             control = glmerControl(optimizer = 'bobyqa', check.conv.grad = .makeCC("warning", tol = 7e-3)))

##### Compute SAPD for Fig. 23ap-24ap
# The input datasets were produced above in Section 2. 
# For convenience, they are also provided and can be loaded with the following commands: 
# reg_data_sent <- get( load('reg_data_sent.RData') )
# reg_data_main <- get( load('reg_data_main.RData') )
# models_main <- get( load('models_main.RData') )
# models_sent <- get( load('models_sent.RData') )

models <- do.call(c, list(models_main, models_sent))
models$ntw <- NULL
models$rtd <- NULL

reg_data_main <- list('main' = reg_data_main)
if (! 'id_num' %in% reg_data_sent) {
  reg_data_sent <- setup_data_analysis(data = reg_data_sent) # add user_id
}
reg_data_sent <- list('sent' = reg_data_sent)
reg_datasets <- do.call(c, list(reg_data_main, reg_data_sent))

sapd_data <- compute_sapd(models = models, reg_datasets = reg_datasets, BONFE_NUM = TOTAL_NUM_MODELS) # 3 mins

datplot_sent <- lapply(names(sapd_data), function(k) sapd_data[[k]]$datplot)
datplot_sent <- do.call('rbind', datplot_sent)

##### Make Fig. 23ap and 24ap
# The input dataset was produced above. 
# For convenience, it is also provided and can be loaded with the following command: datplot_sent <- read.table(file = 'datplot_sent.txt', header = T, sep = '\t')

fig23ap <- make_quick_plot_sent(data = make_data_for_sent(data = datplot_sent, block = 'put'), title = '', xaxis_label = NULL, xmax = 1) 
fig24ap <- make_quick_plot_sent(data = make_data_for_sent(data = datplot_sent, block = 'nav'), title = '', xaxis_label = NULL, xmax = 0.5) 

##### SECTION 8: Appendix M 'Retweet Diversity' (Fig. 25)
# The input dataset was produced above in Section 3 (reg_data_main)
# For convenience, it is also provided and can be loaded with the following command:
# reg_data_main <- get( load('reg_data_main.RData') )
# datplot was produced above (line 224) and can be loaded using the following command 
# datplot <- get( load('datplot.RData') )

reg_data_main <- list('main' = reg_data_main)
models_appm <- list()
models_appm$model_app_m <- glmer(rtd ~ protest_var + peak_var + placebo_var + ntw + (1 + protest_var + peak_var + placebo_var|id_num) + (1|month), data = reg_data_main$main, family = 'poisson', control = glmerControl(optimizer = 'bobyqa', check.conv.grad = .makeCC("warning", tol = 7e-3))) # 60min
sapd_data <- compute_sapd(models = models_appm, reg_datasets = reg_data_main, BONFE_NUM = TOTAL_NUM_MODELS) # 30 min

datplot_new <- lapply(names(sapd_data), function(k) sapd_data[[k]]$datplot)
datplot <- rbind(datplot, datplot_new[[1]])

datplot_app_m <- make_data_for_plot_app_m(data = datplot)
fig25ap <- make_quick_plot_app_m(data = datplot_app_m)

